#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(WGCNA)
library(expss)
library(polycor)
library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
clustering_selected = 'DynamicHybrid'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybrid"
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
# GO DEA
# load('./../Data/Gandal/GO_DE_clusters.RData')
rm(DE_info, GO_annotations, clusterings)
print(paste0('There are ', length(unique(dataset$Module)),' modules:'))
## [1] "There are 62 modules:"
dataset$Module %>% table %>% sort(decreasing=TRUE) %>% print
## .
## #00C0BF #8DAB00 #F066EA #9AA800 #F07F4A #6B9AFF #E96AF1 #FE61D0 #8594FF
## 1298 1277 1158 1149 1086 639 614 566 441
## #00B2F3 #FF67A3 #00BF78 #00C085 #00C092 #E58702 #D675FD #00C19E #00C1AA
## 395 366 328 327 312 295 291 277 259
## #CA9700 #00B9E4 #D19300 #E06FF8 #D88F00 #9B8EFF #43B500 #00B6EC #FF6A97
## 259 254 247 239 217 215 205 197 182
## gray #6FB000 #AFA200 #FE6E8A #C29B00 #A5A500 #00AAFE #00AEF9 #AD87FF
## 178 176 175 168 166 160 158 158 131
## #FB727C #00BC59 #00B930 #FA62D9 #00BE69 #DF8B00 #F47A5D #CA7BFF #00BDD3
## 128 125 123 120 115 114 106 99 97
## #48A0FF #00BEC9 #00C0B5 #13B700 #F8766D #BD81FF #B99E00 #00BBDC #00BB47
## 97 95 94 84 82 72 62 57 52
## #F663E2 #FF61C5 #7FAE00 #FF62BB #EB8332 #FF64AF #00A5FF #5CB300
## 45 45 42 41 40 38 32 32
datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)
# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
MEs = orderMEs(ME_object$eigengenes)
# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits, ML=TRUE)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated'))
}
## [1] "1 correlation(s) could not be calculated"
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = gsub('ME','',rownames(moduleTraitCor)),
yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
main = paste('Module-Trait relationships'))
rm(ME_object, datTraits, MEs, moduleTraitCor, moduleTraitPvalue, textMatrix)
Selecting the three modules with highest (absolute) correlation to Diagnosis: #D675FD, #00BF78 and #13B700
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(ImportantModules = ifelse(Module=='#D675FD', '#D675FD',
ifelse(Module=='#00BF78', '#00BF78',
ifelse(Module=='#13B700', '#13B700', 'Others')))) %>%
mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
alpha = ifelse(ImportantModules=='Others', 0.1, 0.5))
table(plot_data$ImportantModules)
##
## #00BF78 #13B700 #D675FD Others
## 328 84 291 15897
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) +
geom_point(alpha=plot_data$alpha, color=plot_data$color) + theme_minimal() +
ggtitle('Modules with strongest relation to Diagnosis'))
Comparing this with the DEA plot, Gene significance close to 0 means the gene expression is not affected by Diagnosis and negative values means the gene is underexpressed in Autism samples. So the important thing about gene significance is its absolute value
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID')
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) +
scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
theme_minimal() + ggtitle('Gene Significance'))
Module with the highest (absolute) correlation to Diagnosis (-0.93)
plot_data = dataset %>% dplyr::select(MM.D675FD, GS, gene.score) %>% filter(dataset$Module=='#D675FD')
ggplotly(plot_data %>% ggplot(aes(MM.D675FD, GS, color=gene.score)) + geom_point(alpha=0.5) +
scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal())
Module with the second highest (absolute) correlation to Diagnosis (0.91)
plot_data = dataset %>% dplyr::select(MM.00BF78, GS, gene.score) %>% filter(dataset$Module=='#00BF78')
ggplotly(plot_data %>% ggplot(aes(MM.00BF78, GS, color=gene.score)) + geom_point(alpha=0.5) +
scale_color_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
theme_minimal())
Module with the third highest (absolute) correlation to Diagnosis and the top one with positive correlation (0.91)
plot_data = dataset %>% dplyr::select(MM.13B700, GS, gene.score) %>% filter(dataset$Module=='#13B700')
ggplotly(plot_data %>% ggplot(aes(MM.13B700, GS, color=gene.score)) + geom_point(alpha=0.5) +
scale_color_manual(values=SFARI_colour_hue(r=c(2:6,8,7))) +
theme_minimal())
Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance
*Ordered by \(\frac{MM+GS}{2}\)
Only three SFARI Genes in the three lists …
top_genes_1 = dataset %>% dplyr::select(ID, MM.D675FD, GS, gene.score) %>% filter(dataset$Module=='#D675FD') %>%
rename('MM' = 'MM.D675FD') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
kable(top_genes_1, caption='Top 10 genes for module #D675FD')
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000153046 | 0.7905592 | 0.3254951 | None | 0.5580272 |
| ENSG00000160908 | 0.7240197 | 0.3513986 | None | 0.5377092 |
| ENSG00000159256 | 0.7536605 | 0.3007044 | None | 0.5271825 |
| ENSG00000089177 | 0.7058893 | 0.3184176 | None | 0.5121535 |
| ENSG00000180376 | 0.6530201 | -0.3474614 | None | 0.5002408 |
| ENSG00000184863 | 0.6611401 | -0.3284024 | None | 0.4947713 |
| ENSG00000079841 | 0.8338278 | -0.1502685 | 2 | 0.4920481 |
| ENSG00000244754 | 0.8556961 | 0.1265332 | None | 0.4911146 |
| ENSG00000091428 | 0.8103045 | -0.1444815 | 4 | 0.4773930 |
| ENSG00000112701 | 0.8783071 | 0.0739720 | None | 0.4761395 |
top_genes_2 = dataset %>% dplyr::select(ID, MM.00BF78, GS, gene.score) %>% filter(dataset$Module=='#00BF78') %>%
rename('MM' = 'MM.00BF78') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
kable(top_genes_2, caption='Top 10 genes for module #00BF78')
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000147419 | 0.8060082 | -0.9461082 | None | 0.8760582 |
| ENSG00000134108 | 0.8092190 | -0.8920351 | None | 0.8506270 |
| ENSG00000150768 | 0.8523198 | -0.8412765 | None | 0.8467981 |
| ENSG00000134440 | 0.8198250 | -0.8361057 | None | 0.8279653 |
| ENSG00000134265 | 0.7778789 | -0.8478985 | None | 0.8128887 |
| ENSG00000100934 | 0.7329018 | -0.8317850 | None | 0.7823434 |
| ENSG00000075303 | 0.7528216 | -0.7943756 | None | 0.7735986 |
| ENSG00000091140 | 0.7705895 | -0.7718465 | None | 0.7712180 |
| ENSG00000138138 | 0.7461832 | -0.7841764 | None | 0.7651798 |
| ENSG00000213585 | 0.7658138 | -0.7458718 | None | 0.7558428 |
top_genes_3 = dataset %>% dplyr::select(ID, MM.13B700, GS, gene.score) %>% filter(dataset$Module=='#13B700') %>%
rename('MM' = 'MM.13B700') %>% mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>%
top_n(10)
kable(top_genes_3, caption='Top 10 genes for module #13B700')
| ID | MM | GS | gene.score | importance |
|---|---|---|---|---|
| ENSG00000164209 | 0.8630768 | -0.6128660 | None | 0.7379714 |
| ENSG00000075790 | 0.7585227 | -0.6298825 | None | 0.6942026 |
| ENSG00000196950 | 0.7729037 | -0.6077678 | None | 0.6903357 |
| ENSG00000108946 | 0.8154412 | -0.5593705 | None | 0.6874058 |
| ENSG00000004897 | 0.7401680 | -0.6063262 | None | 0.6732471 |
| ENSG00000100614 | 0.7651184 | -0.5325986 | None | 0.6488585 |
| ENSG00000107679 | 0.6441001 | -0.6004142 | None | 0.6222572 |
| ENSG00000009844 | 0.7553578 | -0.4852143 | None | 0.6202860 |
| ENSG00000116918 | 0.6690866 | -0.5708086 | None | 0.6199476 |
| ENSG00000165832 | 0.7841533 | -0.4526986 | None | 0.6184260 |
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
mutate(color = ifelse(Module=='#D675FD', '#D675FD',
ifelse(Module=='#00BF78', '#00BF78',
ifelse(Module=='#13B700', '#13B700', 'gray')))) %>%
mutate(alpha = ifelse(color %in% c('#D675FD', '#00BF78', '#13B700') &
ID %in% c(as.character(top_genes_1$ID),
as.character(top_genes_2$ID),
as.character(top_genes_3$ID)), 1, 0.1))
plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) +
theme_minimal() + ggtitle('Important genes identified through WGCNA')
Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but the plot shows this relation in modules with correlations close to 0, and the behaviour is the same for the other clusterings I have tried…
A good thing is that the gray cluster (which is actually all the genes that were left without a cluster) has a low correlation with ASD and also a low percentage of SFARI genes
plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>%
group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>%
left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>%
mutate(p=round(p/n*100,2))
for(i in 1:nrow(plot_data)){
this_row = plot_data[i,]
if(this_row$hasSFARIscore==FALSE & this_row$p==100){
new_row = this_row
new_row$hasSFARIscore = TRUE
new_row$p = 0
plot_data = plot_data %>% rbind(new_row)
}
}
plot_data = plot_data %>% filter(hasSFARIscore==TRUE)
ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
geom_point(color=plot_data$Module, aes(id=Module)) +
xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row)